In [1]:
import pandas as pd
import numpy as num
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as pit
import warnings

warnings.filterwarnings('ignore')

Load the Dataset¶

In [2]:
cd = pd.read_csv("D:/CRICKET DATASET/Crickets data.csv")
cd
Out[2]:
match_id inning batting_team bowling_team over ball batsman non_striker bowler is_super_over ... bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs player_dismissed dismissal_kind fielder
0 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 1 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 0 0 0 NaN NaN NaN
1 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 2 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 0 0 0 NaN NaN NaN
2 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 3 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 4 0 4 NaN NaN NaN
3 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 4 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 0 0 0 NaN NaN NaN
4 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 5 DA Warner S Dhawan TS Mills 0 ... 0 0 0 0 0 2 2 NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
150455 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 2 Sachin Baby CJ Jordan B Kumar 0 ... 0 0 0 0 2 0 2 NaN NaN NaN
150456 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 3 Sachin Baby CJ Jordan B Kumar 0 ... 0 0 0 0 0 0 0 CJ Jordan run out NV Ojha
150457 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 4 Iqbal Abdulla Sachin Baby B Kumar 0 ... 0 1 0 0 0 1 1 NaN NaN NaN
150458 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 5 Sachin Baby Iqbal Abdulla B Kumar 0 ... 0 0 0 0 1 0 1 NaN NaN NaN
150459 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 6 Iqbal Abdulla Sachin Baby B Kumar 0 ... 0 0 0 0 4 0 4 NaN NaN NaN

150460 rows × 21 columns

In [3]:
dd = pd.read_csv("D:/kk/delivers match.csv")
dd
Out[3]:
id season city date team1 team2 toss_winner toss_decision result dl_applied winner win_by_runs win_by_wickets player_of_match venue umpire1 umpire2 umpire3
0 1 2017 Hyderabad 2017-04-05 Sunrisers Hyderabad Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Sunrisers Hyderabad 35 0 Yuvraj Singh Rajiv Gandhi International Stadium, Uppal AY Dandekar NJ Llong NaN
1 2 2017 Pune 2017-04-06 Mumbai Indians Rising Pune Supergiant Rising Pune Supergiant field normal 0 Rising Pune Supergiant 0 7 SPD Smith Maharashtra Cricket Association Stadium A Nand Kishore S Ravi NaN
2 3 2017 Rajkot 2017-04-07 Gujarat Lions Kolkata Knight Riders Kolkata Knight Riders field normal 0 Kolkata Knight Riders 0 10 CA Lynn Saurashtra Cricket Association Stadium Nitin Menon CK Nandan NaN
3 4 2017 Indore 2017-04-08 Rising Pune Supergiant Kings XI Punjab Kings XI Punjab field normal 0 Kings XI Punjab 0 6 GJ Maxwell Holkar Cricket Stadium AK Chaudhary C Shamshuddin NaN
4 5 2017 Bangalore 2017-04-08 Royal Challengers Bangalore Delhi Daredevils Royal Challengers Bangalore bat normal 0 Royal Challengers Bangalore 15 0 KM Jadhav M Chinnaswamy Stadium NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
631 632 2016 Raipur 2016-05-22 Delhi Daredevils Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Royal Challengers Bangalore 0 6 V Kohli Shaheed Veer Narayan Singh International Stadium A Nand Kishore BNJ Oxenford NaN
632 633 2016 Bangalore 2016-05-24 Gujarat Lions Royal Challengers Bangalore Royal Challengers Bangalore field normal 0 Royal Challengers Bangalore 0 4 AB de Villiers M Chinnaswamy Stadium AK Chaudhary HDPK Dharmasena NaN
633 634 2016 Delhi 2016-05-25 Sunrisers Hyderabad Kolkata Knight Riders Kolkata Knight Riders field normal 0 Sunrisers Hyderabad 22 0 MC Henriques Feroz Shah Kotla M Erasmus C Shamshuddin NaN
634 635 2016 Delhi 2016-05-27 Gujarat Lions Sunrisers Hyderabad Sunrisers Hyderabad field normal 0 Sunrisers Hyderabad 0 4 DA Warner Feroz Shah Kotla M Erasmus CK Nandan NaN
635 636 2016 Bangalore 2016-05-29 Sunrisers Hyderabad Royal Challengers Bangalore Sunrisers Hyderabad bat normal 0 Sunrisers Hyderabad 8 0 BCJ Cutting M Chinnaswamy Stadium HDPK Dharmasena BNJ Oxenford NaN

636 rows × 18 columns

Summary¶

2 files

39 columns

Inspect the dataset¶

In [222]:
cd.shape
Out[222]:
(150460, 21)
In [223]:
dd.shape
Out[223]:
(636, 18)
In [224]:
cd = cd.drop(['player_dismissed','fielder','dismissal_kind'],axis=1)
cd
Out[224]:
match_id inning batting_team bowling_team over ball batsman non_striker bowler is_super_over wide_runs bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs
0 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 1 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0
1 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 2 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0
2 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 3 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 4 0 4
3 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 4 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0
4 1 1 Sunrisers Hyderabad Royal Challengers Bangalore 1 5 DA Warner S Dhawan TS Mills 0 2 0 0 0 0 0 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
150455 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 2 Sachin Baby CJ Jordan B Kumar 0 0 0 0 0 0 2 0 2
150456 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 3 Sachin Baby CJ Jordan B Kumar 0 0 0 0 0 0 0 0 0
150457 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 4 Iqbal Abdulla Sachin Baby B Kumar 0 0 0 1 0 0 0 1 1
150458 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 5 Sachin Baby Iqbal Abdulla B Kumar 0 0 0 0 0 0 1 0 1
150459 636 2 Royal Challengers Bangalore Sunrisers Hyderabad 20 6 Iqbal Abdulla Sachin Baby B Kumar 0 0 0 0 0 0 4 0 4

150460 rows × 18 columns

In [225]:
cd.shape
Out[225]:
(150460, 18)
In [226]:
dd = dd.drop(['id','dl_applied','date','venue','umpire1','umpire2','umpire3'],axis=1)
dd
Out[226]:
season city team1 team2 toss_winner toss_decision result winner win_by_runs win_by_wickets player_of_match
0 2017 Hyderabad Sunrisers Hyderabad Royal Challengers Bangalore Royal Challengers Bangalore field normal Sunrisers Hyderabad 35 0 Yuvraj Singh
1 2017 Pune Mumbai Indians Rising Pune Supergiant Rising Pune Supergiant field normal Rising Pune Supergiant 0 7 SPD Smith
2 2017 Rajkot Gujarat Lions Kolkata Knight Riders Kolkata Knight Riders field normal Kolkata Knight Riders 0 10 CA Lynn
3 2017 Indore Rising Pune Supergiant Kings XI Punjab Kings XI Punjab field normal Kings XI Punjab 0 6 GJ Maxwell
4 2017 Bangalore Royal Challengers Bangalore Delhi Daredevils Royal Challengers Bangalore bat normal Royal Challengers Bangalore 15 0 KM Jadhav
... ... ... ... ... ... ... ... ... ... ... ...
631 2016 Raipur Delhi Daredevils Royal Challengers Bangalore Royal Challengers Bangalore field normal Royal Challengers Bangalore 0 6 V Kohli
632 2016 Bangalore Gujarat Lions Royal Challengers Bangalore Royal Challengers Bangalore field normal Royal Challengers Bangalore 0 4 AB de Villiers
633 2016 Delhi Sunrisers Hyderabad Kolkata Knight Riders Kolkata Knight Riders field normal Sunrisers Hyderabad 22 0 MC Henriques
634 2016 Delhi Gujarat Lions Sunrisers Hyderabad Sunrisers Hyderabad field normal Sunrisers Hyderabad 0 4 DA Warner
635 2016 Bangalore Sunrisers Hyderabad Royal Challengers Bangalore Sunrisers Hyderabad bat normal Sunrisers Hyderabad 8 0 BCJ Cutting

636 rows × 11 columns

In [227]:
dd.shape
Out[227]:
(636, 11)

Get the summary of your dataset¶

In [228]:
cd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150460 entries, 0 to 150459
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   match_id       150460 non-null  int64 
 1   inning         150460 non-null  int64 
 2   batting_team   150460 non-null  object
 3   bowling_team   150460 non-null  object
 4   over           150460 non-null  int64 
 5   ball           150460 non-null  int64 
 6   batsman        150460 non-null  object
 7   non_striker    150460 non-null  object
 8   bowler         150460 non-null  object
 9   is_super_over  150460 non-null  int64 
 10  wide_runs      150460 non-null  int64 
 11  bye_runs       150460 non-null  int64 
 12  legbye_runs    150460 non-null  int64 
 13  noball_runs    150460 non-null  int64 
 14  penalty_runs   150460 non-null  int64 
 15  batsman_runs   150460 non-null  int64 
 16  extra_runs     150460 non-null  int64 
 17  total_runs     150460 non-null  int64 
dtypes: int64(13), object(5)
memory usage: 20.7+ MB

Identify missing values¶¶

In [229]:
cd.isnull().sum()
Out[229]:
match_id         0
inning           0
batting_team     0
bowling_team     0
over             0
ball             0
batsman          0
non_striker      0
bowler           0
is_super_over    0
wide_runs        0
bye_runs         0
legbye_runs      0
noball_runs      0
penalty_runs     0
batsman_runs     0
extra_runs       0
total_runs       0
dtype: int64
In [230]:
dd.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 636 entries, 0 to 635
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   season           636 non-null    int64 
 1   city             629 non-null    object
 2   team1            636 non-null    object
 3   team2            636 non-null    object
 4   toss_winner      636 non-null    object
 5   toss_decision    636 non-null    object
 6   result           636 non-null    object
 7   winner           633 non-null    object
 8   win_by_runs      636 non-null    int64 
 9   win_by_wickets   636 non-null    int64 
 10  player_of_match  633 non-null    object
dtypes: int64(3), object(8)
memory usage: 54.8+ KB

Identify missing values¶¶

In [231]:
dd.isnull().sum()
Out[231]:
season             0
city               7
team1              0
team2              0
toss_winner        0
toss_decision      0
result             0
winner             3
win_by_runs        0
win_by_wickets     0
player_of_match    3
dtype: int64

List all column names¶

In [232]:
cd.columns
Out[232]:
Index(['match_id', 'inning', 'batting_team', 'bowling_team', 'over', 'ball',
       'batsman', 'non_striker', 'bowler', 'is_super_over', 'wide_runs',
       'bye_runs', 'legbye_runs', 'noball_runs', 'penalty_runs',
       'batsman_runs', 'extra_runs', 'total_runs'],
      dtype='object')
In [233]:
dd.columns
Out[233]:
Index(['season', 'city', 'team1', 'team2', 'toss_winner', 'toss_decision',
       'result', 'winner', 'win_by_runs', 'win_by_wickets', 'player_of_match'],
      dtype='object')
In [234]:
cd['batting_team'] = cd['batting_team'].map({'Chennai Super Kings':'CSK',
                                             'Deccan Chargers':'DC',
                                             'Delhi Daredevils':'DD',
                                             'Gujarat Lions':'GL',
                                             'Kings XI Punjab':'KXIP',
                                             'Kochi Tuskers Kerala':'KTK',
                                             'Kolkata Knight Riders':'KKR',
                                             'Mumbai Indians':'MI',
                                             'Pune Warriors':'PW',
                                             'Rajasthan Royals':'RR',
                                             'Rising Pune Supergiant':'RPS',
                                             'Rising Pune Supergiants':'RPSS',
                                             'Royal Challengers Bangalore':'RCB',
                                             'Sunrisers Hyderabad':'SH'})
cd['batting_team']
Out[234]:
0          SH
1          SH
2          SH
3          SH
4          SH
         ... 
150455    RCB
150456    RCB
150457    RCB
150458    RCB
150459    RCB
Name: batting_team, Length: 150460, dtype: object
In [235]:
cd['bowling_team'] = cd['bowling_team'].map({'Chennai Super Kings':'CSK',
                                             'Deccan Chargers':'DC',
                                             'Delhi Daredevils':'DD',
                                             'Gujarat Lions':'GL',
                                             'Kings XI Punjab':'KXIP',
                                             'Kochi Tuskers Kerala':'KTK',
                                             'Kolkata Knight Riders':'KKR',
                                             'Mumbai Indians':'MI',
                                             'Pune Warriors':'PW',
                                             'Rajasthan Royals':'RR',
                                             'Rising Pune Supergiant':'RPS',
                                             'Rising Pune Supergiants':'RPSS',
                                             'Royal Challengers Bangalore':'RCB',
                                             'Sunrisers Hyderabad':'SH'})
cd['bowling_team']
Out[235]:
0         RCB
1         RCB
2         RCB
3         RCB
4         RCB
         ... 
150455     SH
150456     SH
150457     SH
150458     SH
150459     SH
Name: bowling_team, Length: 150460, dtype: object
In [236]:
cd.head()
Out[236]:
match_id inning batting_team bowling_team over ball batsman non_striker bowler is_super_over wide_runs bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs
0 1 1 SH RCB 1 1 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0
1 1 1 SH RCB 1 2 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0
2 1 1 SH RCB 1 3 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 4 0 4
3 1 1 SH RCB 1 4 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0 0 0
4 1 1 SH RCB 1 5 DA Warner S Dhawan TS Mills 0 2 0 0 0 0 0 2 2
In [237]:
cd.tail()
Out[237]:
match_id inning batting_team bowling_team over ball batsman non_striker bowler is_super_over wide_runs bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs
150455 636 2 RCB SH 20 2 Sachin Baby CJ Jordan B Kumar 0 0 0 0 0 0 2 0 2
150456 636 2 RCB SH 20 3 Sachin Baby CJ Jordan B Kumar 0 0 0 0 0 0 0 0 0
150457 636 2 RCB SH 20 4 Iqbal Abdulla Sachin Baby B Kumar 0 0 0 1 0 0 0 1 1
150458 636 2 RCB SH 20 5 Sachin Baby Iqbal Abdulla B Kumar 0 0 0 0 0 0 1 0 1
150459 636 2 RCB SH 20 6 Iqbal Abdulla Sachin Baby B Kumar 0 0 0 0 0 0 4 0 4
In [238]:
dd['team1'] = dd['team1'].map({'Chennai Super Kings':'CSK',
                                             'Deccan Chargers':'DC',
                                             'Delhi Daredevils':'DD',
                                             'Gujarat Lions':'GL',
                                             'Kings XI Punjab':'KXIP',
                                             'Kochi Tuskers Kerala':'KTK',
                                             'Kolkata Knight Riders':'KKR',
                                             'Mumbai Indians':'MI',
                                             'Pune Warriors':'PW',
                                             'Rajasthan Royals':'RR',
                                             'Rising Pune Supergiant':'RPS',
                                             'Rising Pune Supergiants':'RPSS',
                                             'Royal Challengers Bangalore':'RCB',
                                             'Sunrisers Hyderabad':'SH'})
dd['team1']
Out[238]:
0       SH
1       MI
2       GL
3      RPS
4      RCB
      ... 
631     DD
632     GL
633     SH
634     GL
635     SH
Name: team1, Length: 636, dtype: object
In [239]:
dd['team2'] = dd['team2'].map({'Chennai Super Kings':'CSK',
                                             'Deccan Chargers':'DC',
                                             'Delhi Daredevils':'DD',
                                             'Gujarat Lions':'GL',
                                             'Kings XI Punjab':'KXIP',
                                             'Kochi Tuskers Kerala':'KTK',
                                             'Kolkata Knight Riders':'KKR',
                                             'Mumbai Indians':'MI',
                                             'Pune Warriors':'PW',
                                             'Rajasthan Royals':'RR',
                                             'Rising Pune Supergiant':'RPS',
                                             'Rising Pune Supergiants':'RPSS',
                                             'Royal Challengers Bangalore':'RCB',
                                             'Sunrisers Hyderabad':'SH'})
dd['team2']
Out[239]:
0       RCB
1       RPS
2       KKR
3      KXIP
4        DD
       ... 
631     RCB
632     RCB
633     KKR
634      SH
635     RCB
Name: team2, Length: 636, dtype: object
In [240]:
dd['toss_winner'] = dd['toss_winner'].map({'Chennai Super Kings':'CSK',
                                             'Deccan Chargers':'DC',
                                             'Delhi Daredevils':'DD',
                                             'Gujarat Lions':'GL',
                                             'Kings XI Punjab':'KXIP',
                                             'Kochi Tuskers Kerala':'KTK',
                                             'Kolkata Knight Riders':'KKR',
                                             'Mumbai Indians':'MI',
                                             'Pune Warriors':'PW',
                                             'Rajasthan Royals':'RR',
                                             'Rising Pune Supergiant':'RPS',
                                             'Rising Pune Supergiants':'RPSS',
                                             'Royal Challengers Bangalore':'RCB',
                                             'Sunrisers Hyderabad':'SH'})
dd['toss_winner']
Out[240]:
0       RCB
1       RPS
2       KKR
3      KXIP
4       RCB
       ... 
631     RCB
632     RCB
633     KKR
634      SH
635      SH
Name: toss_winner, Length: 636, dtype: object
In [241]:
dd['winner'] = dd['winner'].map({'Chennai Super Kings':'CSK',
                                             'Deccan Chargers':'DC',
                                             'Delhi Daredevils':'DD',
                                             'Gujarat Lions':'GL',
                                             'Kings XI Punjab':'KXIP',
                                             'Kochi Tuskers Kerala':'KTK',
                                             'Kolkata Knight Riders':'KKR',
                                             'Mumbai Indians':'MI',
                                             'Pune Warriors':'PW',
                                             'Rajasthan Royals':'RR',
                                             'Rising Pune Supergiant':'RPS',
                                             'Rising Pune Supergiants':'RPSS',
                                             'Royal Challengers Bangalore':'RCB',
                                             'Sunrisers Hyderabad':'SH'})
dd['winner']
Out[241]:
0        SH
1       RPS
2       KKR
3      KXIP
4       RCB
       ... 
631     RCB
632     RCB
633      SH
634      SH
635      SH
Name: winner, Length: 636, dtype: object
In [242]:
dd.head()
Out[242]:
season city team1 team2 toss_winner toss_decision result winner win_by_runs win_by_wickets player_of_match
0 2017 Hyderabad SH RCB RCB field normal SH 35 0 Yuvraj Singh
1 2017 Pune MI RPS RPS field normal RPS 0 7 SPD Smith
2 2017 Rajkot GL KKR KKR field normal KKR 0 10 CA Lynn
3 2017 Indore RPS KXIP KXIP field normal KXIP 0 6 GJ Maxwell
4 2017 Bangalore RCB DD RCB bat normal RCB 15 0 KM Jadhav
In [216]:
dd.tail()
Out[216]:
season city team1 team2 toss_winner toss_decision result winner win_by_runs win_by_wickets player_of_match
631 2016 Raipur DD RCB RCB field normal RCB 0 6 V Kohli
632 2016 Bangalore GL RCB RCB field normal RCB 0 4 AB de Villiers
633 2016 Delhi SH KKR KKR field normal SH 22 0 MC Henriques
634 2016 Delhi GL SH SH field normal SH 0 4 DA Warner
635 2016 Bangalore SH RCB SH bat normal SH 8 0 BCJ Cutting

Get basic statistical details¶

In [143]:
cd.describe()
Out[143]:
match_id inning over ball is_super_over wide_runs bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs
count 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000 150460.000000
mean 318.281317 1.482188 10.142649 3.616483 0.000538 0.037498 0.004885 0.022232 0.004340 0.000066 1.222445 0.069022 1.291466
std 182.955531 0.501768 5.674338 1.807698 0.023196 0.257398 0.114234 0.200104 0.072652 0.018229 1.594509 0.349667 1.583240
min 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 161.000000 1.000000 5.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
50% 319.000000 1.000000 10.000000 4.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000
75% 476.000000 2.000000 15.000000 5.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000 1.000000
max 636.000000 4.000000 20.000000 9.000000 1.000000 5.000000 4.000000 5.000000 5.000000 5.000000 6.000000 7.000000 7.000000
In [144]:
dd.describe()
Out[144]:
season win_by_runs win_by_wickets
count 636.000000 636.000000 636.000000
mean 2012.490566 13.682390 3.372642
std 2.773026 23.908877 3.420338
min 2008.000000 0.000000 0.000000
25% 2010.000000 0.000000 0.000000
50% 2012.000000 0.000000 4.000000
75% 2015.000000 20.000000 7.000000
max 2017.000000 146.000000 10.000000
In [146]:
cd.boxplot(column =["inning","over","ball","is_super_over","wide_runs","bye_runs","legbye_runs","noball_runs","penalty_runs","batsman_runs","extra_runs","total_runs"])
plt.xticks(rotation=90)
plt.title('Finding any outliners')
plt.show()
No description has been provided for this image
In [147]:
cd['total_runs'].describe()
Out[147]:
count    150460.000000
mean          1.291466
std           1.583240
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           7.000000
Name: total_runs, dtype: float64
In [148]:
import numpy as np
Q1=cd['total_runs'].quantile(0.25)
Q3=cd['total_runs'].quantile(0.75)
IQR=Q3-Q1

lower_bound = Q1 - 1.5 *IQR
Upper_bound = Q3 + 1.5 *IQR

print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {Upper_bound}")

median_value = cd['total_runs'].median()
print(f"Median value: {median_value}")

cd['total_runs']=np.where((cd['total_runs']< lower_bound) | (cd['total_runs']> Upper_bound),median_value,cd['total_runs'])
Lower Bound: -1.5
Upper Bound: 2.5
Median value: 1.0
In [149]:
cd.boxplot(column = ['total_runs'])
plt.title('Total_runs')
plt.show()
No description has been provided for this image
In [150]:
cd.shape
Out[150]:
(150460, 18)
In [151]:
cd['batsman_runs'].describe()
Out[151]:
count    150460.000000
mean          1.222445
std           1.594509
min           0.000000
25%           0.000000
50%           1.000000
75%           1.000000
max           6.000000
Name: batsman_runs, dtype: float64
In [152]:
import numpy as np
Q1=cd['batsman_runs'].quantile(0.25)
Q3=cd['batsman_runs'].quantile(0.75)
IQR=Q3-Q1

lower_bound = Q1 - 1.5 *IQR
Upper_bound = Q3 + 1.5 *IQR

print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {Upper_bound}")

median_value = cd['batsman_runs'].median()
print(f"Median value: {median_value}")

cd['batsman_runs']=np.where((cd['batsman_runs']< lower_bound) | (cd['batsman_runs']> Upper_bound),median_value,cd['batsman_runs'])
Lower Bound: -1.5
Upper Bound: 2.5
Median value: 1.0
In [153]:
cd.boxplot(column = ['batsman_runs'])
plt.title('Batsman_runs')
plt.show()
No description has been provided for this image
In [154]:
cd.shape
Out[154]:
(150460, 18)
In [155]:
cd['inning'].describe()
Out[155]:
count    150460.000000
mean          1.482188
std           0.501768
min           1.000000
25%           1.000000
50%           1.000000
75%           2.000000
max           4.000000
Name: inning, dtype: float64
In [156]:
import numpy as np
Q1=cd['inning'].quantile(0.25)
Q3=cd['inning'].quantile(0.75)
IQR=Q3-Q1

lower_bound = Q1 - 1.5 *IQR
Upper_bound = Q3 + 1.5 *IQR

print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {Upper_bound}")

median_value = cd['inning'].median()
print(f"Median value: {median_value}")

cd['inning']=np.where((cd['inning']< lower_bound) | (cd['inning']> Upper_bound),median_value,cd['inning'])
Lower Bound: -0.5
Upper Bound: 3.5
Median value: 1.0
In [157]:
cd.boxplot(column = ['inning'])
plt.title('Innings')
plt.show()
No description has been provided for this image
In [158]:
cd.shape
Out[158]:
(150460, 18)
In [159]:
dd.boxplot(column = ['win_by_runs','win_by_wickets'])
plt.title('Finding any outliners')
plt.show()                                         
No description has been provided for this image
In [160]:
import numpy as np
Q1=dd['win_by_runs'].quantile(0.25)
Q3=dd['win_by_runs'].quantile(0.75)
IQR=Q3-Q1

lower_bound = Q1 - 1.5 *IQR
Upper_bound = Q3 + 1.5 *IQR

print(f"Lower Bound: {lower_bound}")
print(f"Upper Bound: {Upper_bound}")

median_value = dd['win_by_runs'].median()
print(f"Median value: {median_value}")

dd['win_by_runs']=np.where((dd['win_by_runs']< lower_bound) | (dd['win_by_runs']> Upper_bound),median_value,dd['win_by_runs'])
Lower Bound: -30.0
Upper Bound: 50.0
Median value: 0.0
In [161]:
dd.boxplot(column = ['win_by_runs'])
plt.title('Finding any outliners')
plt.show()                                         
No description has been provided for this image
In [162]:
dd.shape
Out[162]:
(636, 11)
In [163]:
cd['match_id'].unique()
Out[163]:
array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182,
       183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195,
       196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208,
       209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,
       222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
       235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247,
       248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260,
       261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273,
       274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286,
       287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299,
       300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312,
       313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325,
       326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338,
       339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351,
       352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364,
       365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
       378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390,
       391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403,
       404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416,
       417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429,
       430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442,
       443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455,
       456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468,
       469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 479, 480, 481,
       482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492, 493, 494,
       495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507,
       508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520,
       521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533,
       534, 535, 536, 537, 538, 539, 540, 541, 542, 543, 544, 545, 546,
       547, 548, 549, 550, 551, 552, 553, 554, 555, 556, 557, 558, 559,
       560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572,
       573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585,
       586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598,
       599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611,
       612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624,
       625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636],
      dtype=int64)
In [164]:
match_1 = cd[cd['match_id']==1]
In [165]:
match_1.head()
Out[165]:
match_id inning batting_team bowling_team over ball batsman non_striker bowler is_super_over wide_runs bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs
0 1 1.0 SH RCB 1 1 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0.0 0 0.0
1 1 1.0 SH RCB 1 2 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0.0 0 0.0
2 1 1.0 SH RCB 1 3 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 1.0 0 1.0
3 1 1.0 SH RCB 1 4 DA Warner S Dhawan TS Mills 0 0 0 0 0 0 0.0 0 0.0
4 1 1.0 SH RCB 1 5 DA Warner S Dhawan TS Mills 0 2 0 0 0 0 0.0 2 2.0
In [166]:
match_1.shape
Out[166]:
(248, 18)
In [167]:
srh=match_1[match_1['inning']==1]
In [168]:
srh['batsman_runs'].value_counts()
Out[168]:
batsman_runs
1.0    84
0.0    32
2.0     9
Name: count, dtype: int64
In [169]:
rcb=match_1[match_1['inning']==2]
In [170]:
rcb['batsman_runs'].value_counts()
Out[170]:
batsman_runs
1.0    67
0.0    49
2.0     7
Name: count, dtype: int64

Visuallization¶

In [171]:
fig = pit.pie(data_frame=cd,names='over')
fig.show()       # Increase to decrease
In [73]:
fig = pit.pie(data_frame=cd,names='ball')
plt.title('Distribution of ball')
fig.show()
In [74]:
fig = pit.pie(data_frame=cd,names='wide_runs')
fig.show()
In [75]:
fig = pit.pie(data_frame=cd,names='bye_runs')
fig.show()
In [76]:
fig = pit.pie(data_frame=cd,names='legbye_runs')
fig.show()
In [77]:
fig = pit.pie(data_frame=cd,names='noball_runs')
fig.show()
In [78]:
fig = pit.pie(data_frame=cd,names='penalty_runs')
fig.show()
In [79]:
fig = pit.pie(data_frame=cd,names='extra_runs')
fig.show()
In [80]:
fig = pit.pie(data_frame=cd,names='total_runs')
fig.show()
In [81]:
cd['is_super_over']=cd['is_super_over'].map({0:'No super over',1:'Super over'})
cd
Out[81]:
match_id inning batting_team bowling_team over ball batsman non_striker bowler is_super_over wide_runs bye_runs legbye_runs noball_runs penalty_runs batsman_runs extra_runs total_runs
0 1 1.0 SH RCB 1 1 DA Warner S Dhawan TS Mills No super over 0 0 0 0 0 0.0 0 0.0
1 1 1.0 SH RCB 1 2 DA Warner S Dhawan TS Mills No super over 0 0 0 0 0 0.0 0 0.0
2 1 1.0 SH RCB 1 3 DA Warner S Dhawan TS Mills No super over 0 0 0 0 0 1.0 0 1.0
3 1 1.0 SH RCB 1 4 DA Warner S Dhawan TS Mills No super over 0 0 0 0 0 0.0 0 0.0
4 1 1.0 SH RCB 1 5 DA Warner S Dhawan TS Mills No super over 2 0 0 0 0 0.0 2 2.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
150455 636 2.0 RCB SH 20 2 Sachin Baby CJ Jordan B Kumar No super over 0 0 0 0 0 2.0 0 2.0
150456 636 2.0 RCB SH 20 3 Sachin Baby CJ Jordan B Kumar No super over 0 0 0 0 0 0.0 0 0.0
150457 636 2.0 RCB SH 20 4 Iqbal Abdulla Sachin Baby B Kumar No super over 0 0 1 0 0 0.0 1 1.0
150458 636 2.0 RCB SH 20 5 Sachin Baby Iqbal Abdulla B Kumar No super over 0 0 0 0 0 1.0 0 1.0
150459 636 2.0 RCB SH 20 6 Iqbal Abdulla Sachin Baby B Kumar No super over 0 0 0 0 0 1.0 0 1.0

150460 rows × 18 columns

In [83]:
plt.figure(figsize=(5,4))
sns.countplot(x='is_super_over',data=cd,palette=['Red','black'])
plt.title('Count plot')
plt.show()
# 150000 of instances or counts of match happened
No description has been provided for this image

SO THIS WE CAN SAY THERE IS NO SUPER OVER IN THE MATCH¶

In [85]:
plt.figure(figsize=(5,6))
sns.countplot(x='batting_team',data=cd)
plt.xticks(rotation=90)
plt.title('Count plot')
plt.show()
No description has been provided for this image
In [86]:
cd['batting_team'].value_counts()
Out[86]:
batting_team
MI      18943
RCB     17678
KXIP    17594
KKR     17229
DD      17185
CSK     15754
RR      13914
SH       9058
DC       9034
PW       5443
GL       3566
RPS      1900
KTK      1582
RPSS     1580
Name: count, dtype: int64
In [215]:
plt.figure(figsize=(5,6))
sns.countplot(x='bowling_team',data=cd)
plt.xticks(rotation=90)
plt.title('Count plot')
plt.show()
No description has been provided for this image
In [84]:
cd['bowling_team'].value_counts()
Out[84]:
bowling_team
MI      18879
RCB     17920
KKR     17411
KXIP    17392
DD      17099
CSK     15562
RR      14111
DC       9039
SH       8888
PW       5457
GL       3545
RPS      1928
RPSS     1615
KTK      1614
Name: count, dtype: int64
In [87]:
plt.figure(figsize = (10,5))
sns.barplot(x='batting_team',y ='total_runs', data = cd)
plt.xticks(rotation = 90)
plt.show()
No description has been provided for this image

THIS VISUALIZATION SHOW THE PLAYER WHO HAVE WON THE MOST PLAYER OF MATCH (SEASON (2008-2017))¶

In [88]:
dd['season'].value_counts()
Out[88]:
season
2013    76
2012    74
2011    73
2010    60
2014    60
2016    60
2017    59
2015    59
2008    58
2009    57
Name: count, dtype: int64
In [90]:
import plotly.express as px

season_counts = dd['season'].value_counts().reset_index()
season_counts.columns = ['season', 'count']

# Sort by season (not strictly necessary if data is already sorted)
season_counts = season_counts.sort_values('season')

# Plotting with Plotly
fig = px.bar(season_counts, 
             x='season', 
             y='count', 
             title='Number of matches played in each IPL season',
             labels={'season': 'Season', 'count': 'No. of matches'},
             template='plotly_dark',  # Example of using a dark theme
             color='season')  # Optional: Color bars by season

fig.update_layout(xaxis={'categoryorder':'category ascending'})  # Ensure x-axis is sorted

fig.show()

The IPL 2013 season had the highest number of matches played¶

In [93]:
plt.subplots(figsize=(10,6))
dd['toss_winner'].value_counts().plot.bar(width=0.8)
plt.show()
No description has been provided for this image
In [94]:
dd['toss_winner'].value_counts()
Out[94]:
toss_winner
MI      85
KKR     78
DD      72
RCB     70
KXIP    68
CSK     66
RR      63
DC      43
SH      35
PW      20
GL      15
KTK      8
RPSS     7
RPS      6
Name: count, dtype: int64

CHOICE OF TOSS IN DIFFERENT SEASONS¶

In [95]:
plt.subplots()
sns.countplot(x="season",hue="toss_decision",data=dd,palette=['Red','Green'])
plt.show()
No description has been provided for this image

Starting from the IPL-2016 season, there is a noticeable trend favoring the decision to field¶

In [96]:
plt.subplots(figsize=(10,6))
dd['winner'].value_counts().plot.bar(width=0.8)
plt.show()
No description has been provided for this image
In [97]:
dd['winner'].value_counts()
Out[97]:
winner
ML      92
CSK     79
KKR     78
RCB     73
KXIP    70
RR      64
DD      62
SH      42
DC      30
GL      13
PW      12
RPS     10
KTK      6
RPSS     5
Name: count, dtype: int64

INFERENCE¶

IPL T20 MATCH IN 2008 TO 2017¶

MAJOR INDICENTS¶

In July 2015, the Supreme Court of India set up a three-member committee led by former Chief Justice R M Lodha to investigate. The Justice Lodha Committee found evidence of match-fixing and betting, leading to a two-year ban for both Chennai Super Kings and Rajasthan Royals from the IPL in 2016 and 2017.¶

If this circumstances not happened, CSK may be the winner of match is most frequency¶

In [106]:
plt.figure(figsize=(5,4))
sns.countplot(x='player_of_match',data=dd)
plt.title('Most frequented')
plt.xticks(rotation=90)
plt.show()
No description has been provided for this image
In [107]:
dd['player_of_match'].value_counts()
Out[107]:
player_of_match
CH Gayle          18
YK Pathan         16
DA Warner         15
AB de Villiers    15
RG Sharma         14
                  ..
AD Mathews         1
LR Shukla          1
R Bhatia           1
A Singh            1
BCJ Cutting        1
Name: count, Length: 201, dtype: int64

TAKEN TOP 5 PLAYERS¶

In [108]:
dd['player_of_match'].value_counts()[0:5]
Out[108]:
player_of_match
CH Gayle          18
YK Pathan         16
DA Warner         15
AB de Villiers    15
RG Sharma         14
Name: count, dtype: int64

Chris Gayle has received the highest number of Man of the Match awards¶

Finding out the number of wins each team after batting first¶

In [109]:
batting_first=dd[dd['win_by_runs']!=0]
In [177]:
batting_first.head()
Out[177]:
season city team1 team2 toss_winner toss_decision result winner win_by_runs win_by_wickets player_of_match
0 2017 Hyderabad SH RCB RCB field normal SH 35.0 0 Yuvraj Singh
4 2017 Bangalore RCB DD RCB bat normal RCB 15.0 0 KM Jadhav
13 2017 Kolkata KKR SH SH field normal KKR 17.0 0 RV Uthappa
16 2017 Bangalore RPS RCB RCB field normal RPS 27.0 0 BA Stokes
18 2017 Hyderabad SH KXIP KXIP field normal SH 5.0 0 B Kumar
In [181]:
batting_first['win_by_runs'].shape
Out[181]:
(240,)
In [182]:
batting_first['win_by_runs'].unique()
Out[182]:
array([35., 15., 17., 27.,  5., 21., 14., 26.,  3., 48., 19., 12.,  7.,
        9., 10., 20.,  1., 33.,  6., 13., 45., 29., 18., 23., 41., 25.,
       11., 24., 38.,  8., 16.,  2.,  4., 31., 34., 36., 39., 40., 37.,
       22., 32., 43., 28., 42., 46., 47., 44., 30., 50.])

COMPARING 3 COLUMNS ARE TEAM 1 , TEAM 2 , WINNER TEAM AND WIN_BY_RUNS¶

In [110]:
batting_first.head()         
Out[110]:
season city team1 team2 toss_winner toss_decision result winner win_by_runs win_by_wickets player_of_match
0 2017 Hyderabad SH RCB RCB field normal SH 35.0 0 Yuvraj Singh
4 2017 Bangalore RCB DD RCB bat normal RCB 15.0 0 KM Jadhav
13 2017 Kolkata KKR SH SH field normal KKR 17.0 0 RV Uthappa
16 2017 Bangalore RPS RCB RCB field normal RPS 27.0 0 BA Stokes
18 2017 Hyderabad SH KXIP KXIP field normal SH 5.0 0 B Kumar
In [184]:
#MAKING A HISTROGRAM
plt.figure(figsize=(7,7))
plt.hist(batting_first['win_by_runs'])
plt.title('Distribution of runs')
plt.xlabel('Runs')
plt.show()                              # x=win_by_runs and y=counts as happens
No description has been provided for this image

COMPARING FIRST BATTING TEAM HAS CHOSEN, AND THE WINNER TEAM¶

In [172]:
plt.figure(figsize=(7,7))
plt.bar(list(batting_first['winner'].value_counts()[0:3].keys()),list(batting_first['winner'].value_counts()[0:3]),color=['blue','yellow','orange'])
plt.show()
No description has been provided for this image
In [173]:
plt.figure(figsize=(7,7))
plt.pie(list(batting_first['winner'].value_counts()),labels=list(batting_first['winner'].value_counts().keys()))
plt.show()
No description has been provided for this image

Finding out the number of wins each team after batting second¶

In [174]:
batting_second = dd[dd['win_by_wickets']!=0]
In [175]:
batting_second.head()
Out[175]:
season city team1 team2 toss_winner toss_decision result winner win_by_runs win_by_wickets player_of_match
1 2017 Pune MI RPS RPS field normal RPS 0.0 7 SPD Smith
2 2017 Rajkot GL KKR KKR field normal KKR 0.0 10 CA Lynn
3 2017 Indore RPS KXIP KXIP field normal KXIP 0.0 6 GJ Maxwell
5 2017 Hyderabad GL SH SH field normal SH 0.0 9 Rashid Khan
6 2017 Mumbai KKR MI MI field normal M 0.0 4 N Rana
In [176]:
batting_second['win_by_wickets'].unique()
Out[176]:
array([ 7, 10,  6,  9,  4,  8,  5,  2,  3,  1], dtype=int64)
In [13]:
plt.figure(figsize=(7,7))
plt.hist(batting_second['win_by_wickets'],bins=20)
plt.show()
No description has been provided for this image

we have a interesting histrogram here, 70 instances or matches were the batting second has 1 with 7 wickets in hand¶

It seems that if team batting second as won the match then most probabile it hood not have lost or lot of wickets¶

IT SEEMS LIKE NORMAL DISTRIBUTION , BUT IT IS DISCRETE¶

THE TEAM LOSE WICKETS , IT SEEMS LIKE IF THE TEAM BATTING SECOND WON THE MATCH THEN IT HOOD LOSE AROUND 4 TO 8 WICKETS¶

MORE THAN 8 WICKETS THEN THERE ARE VERY FEW CHANGES FOR THE TEAM TO WIN THE MATCH AFTER THAT.¶

EXACTLY 10 MATCHES OF 1 WITH 10 WICKETS REMAINING, WHERE THE TEAM BATTING SECOND HAS NOT EVEN LOST A SINGLE WICKET GONE TO DISMENT THEIR OPPENENT¶

Finding out the number of wins each team after batting second¶

In [243]:
plt.figure(figsize=(7,7))
plt.bar(list(batting_second['winner'].value_counts()[0:3].keys()),list(batting_second['winner'].value_counts()[0:3]),color=['purple','blue','red'])
plt.show()
No description has been provided for this image
In [244]:
batting_second['winner'].value_counts()
Out[244]:
winner
KKR     46
M       44
RCB     42
DD      41
RR      38
KXIP    36
CSK     33
SH      18
GL      12
DC      11
PW       6
RPS      5
KTK      4
RPSS     3
Name: count, dtype: int64
In [245]:
plt.figure(figsize=(7,7))
plt.pie(list(batting_second['winner'].value_counts()),labels=list(batting_second['winner'].value_counts().keys()))
plt.show()
No description has been provided for this image
In [247]:
#Finding out how many times team has won the match after winning the toss
num.sum(dd['toss_winner']==dd['winner'])
Out[247]:
325
In [248]:
325/636
Out[248]:
0.5110062893081762

51% of the times a team winning the toss also win the match so, it actually means that not much of effect on toss.¶

its more like a equally(50-50) neither its heads or tails so toss may be afftect the match that much¶

HYPOTHESIS TEST¶

CHI SQUARE TESTING¶

In [249]:
from scipy.stats import chi2_contingency

contingency_table = pd.crosstab(dd["city"],dd["winner"])

chi2_result = chi2_contingency(contingency_table,correction = False)
print("chi square result:",chi2_result)
chi square result: Chi2ContingencyResult(statistic=1439.865131584083, pvalue=8.258671120736886e-124, dof=377, expected_freq=array([[ 0.86102236,  0.32428115,  0.68210863,  0.14536741,  0.86102236,
         0.06709265,  0.7715655 ,  1.02875399,  0.1341853 ,  0.80511182,
         0.11182109,  0.05591054,  0.70447284,  0.44728435],
       [ 1.47603834,  0.55591054,  1.16932907,  0.24920128,  1.47603834,
         0.11501597,  1.32268371,  1.76357827,  0.23003195,  1.38019169,
         0.19169329,  0.09584665,  1.20766773,  0.76677316],
       [ 7.87220447,  2.96485623,  6.23642173,  1.32907348,  7.87220447,
         0.61341853,  7.0543131 ,  9.4057508 ,  1.22683706,  7.36102236,
         1.02236422,  0.51118211,  6.44089457,  4.08945687],
       [ 0.24600639,  0.09265176,  0.19488818,  0.04153355,  0.24600639,
         0.01916933,  0.22044728,  0.29392971,  0.03833866,  0.23003195,
         0.03194888,  0.01597444,  0.20127796,  0.12779553],
       [ 0.86102236,  0.32428115,  0.68210863,  0.14536741,  0.86102236,
         0.06709265,  0.7715655 ,  1.02875399,  0.1341853 ,  0.80511182,
         0.11182109,  0.05591054,  0.70447284,  0.44728435],
       [ 1.47603834,  0.55591054,  1.16932907,  0.24920128,  1.47603834,
         0.11501597,  1.32268371,  1.76357827,  0.23003195,  1.38019169,
         0.19169329,  0.09584665,  1.20766773,  0.76677316],
       [ 5.65814696,  2.13099042,  4.48242812,  0.95527157,  5.65814696,
         0.44089457,  5.07028754,  6.76038339,  0.88178914,  5.29073482,
         0.73482428,  0.36741214,  4.62939297,  2.93929712],
       [ 5.90415335,  2.22364217,  4.67731629,  0.99680511,  5.90415335,
         0.4600639 ,  5.29073482,  7.0543131 ,  0.9201278 ,  5.52076677,
         0.76677316,  0.38338658,  4.83067093,  3.06709265],
       [ 0.86102236,  0.32428115,  0.68210863,  0.14536741,  0.86102236,
         0.06709265,  0.7715655 ,  1.02875399,  0.1341853 ,  0.80511182,
         0.11182109,  0.05591054,  0.70447284,  0.44728435],
       [ 7.2571885 ,  2.73322684,  5.74920128,  1.22523962,  7.2571885 ,
         0.56549521,  6.50319489,  8.67092652,  1.13099042,  6.78594249,
         0.94249201,  0.47124601,  5.93769968,  3.76996805],
       [ 1.10702875,  0.41693291,  0.87699681,  0.18690096,  1.10702875,
         0.08626198,  0.99201278,  1.32268371,  0.17252396,  1.03514377,
         0.14376997,  0.07188498,  0.9057508 ,  0.57507987],
       [ 1.84504792,  0.69488818,  1.46166134,  0.3115016 ,  1.84504792,
         0.14376997,  1.65335463,  2.20447284,  0.28753994,  1.72523962,
         0.23961661,  0.11980831,  1.50958466,  0.95846645],
       [ 0.36900958,  0.13897764,  0.29233227,  0.06230032,  0.36900958,
         0.02875399,  0.33067093,  0.44089457,  0.05750799,  0.34504792,
         0.04792332,  0.02396166,  0.30191693,  0.19169329],
       [ 6.02715655,  2.26996805,  4.77476038,  1.01757188,  6.02715655,
         0.46964856,  5.40095847,  7.20127796,  0.93929712,  5.63578275,
         0.7827476 ,  0.3913738 ,  4.9313099 ,  3.13099042],
       [ 0.61501597,  0.23162939,  0.48722045,  0.10383387,  0.61501597,
         0.04792332,  0.55111821,  0.73482428,  0.09584665,  0.57507987,
         0.0798722 ,  0.0399361 ,  0.50319489,  0.31948882],
       [ 4.05910543,  1.52875399,  3.21565495,  0.68530351,  4.05910543,
         0.31629393,  3.63738019,  4.84984026,  0.63258786,  3.79552716,
         0.52715655,  0.26357827,  3.32108626,  2.1086262 ],
       [ 0.98402556,  0.37060703,  0.77955272,  0.16613419,  0.98402556,
         0.07667732,  0.88178914,  1.17571885,  0.15335463,  0.9201278 ,
         0.12779553,  0.06389776,  0.80511182,  0.51118211],
       [ 0.49201278,  0.18530351,  0.38977636,  0.08306709,  0.49201278,
         0.03833866,  0.44089457,  0.58785942,  0.07667732,  0.4600639 ,
         0.06389776,  0.03194888,  0.40255591,  0.25559105],
       [ 0.36900958,  0.13897764,  0.29233227,  0.06230032,  0.36900958,
         0.02875399,  0.33067093,  0.44089457,  0.05750799,  0.34504792,
         0.04792332,  0.02396166,  0.30191693,  0.19169329],
       [ 0.61501597,  0.23162939,  0.48722045,  0.10383387,  0.61501597,
         0.04792332,  0.55111821,  0.73482428,  0.09584665,  0.57507987,
         0.0798722 ,  0.0399361 ,  0.50319489,  0.31948882],
       [ 7.50319489,  2.82587859,  5.94408946,  1.26677316,  7.50319489,
         0.58466454,  6.72364217,  8.96485623,  1.16932907,  7.01597444,
         0.97444089,  0.48722045,  6.13897764,  3.89776358],
       [10.45527157,  3.93769968,  8.2827476 ,  1.76517572, 10.45527157,
         0.81469649,  9.36900958, 12.49201278,  1.62939297,  9.77635783,
         1.35782748,  0.67891374,  8.5543131 ,  5.4313099 ],
       [ 0.36900958,  0.13897764,  0.29233227,  0.06230032,  0.36900958,
         0.02875399,  0.33067093,  0.44089457,  0.05750799,  0.34504792,
         0.04792332,  0.02396166,  0.30191693,  0.19169329],
       [ 0.86102236,  0.32428115,  0.68210863,  0.14536741,  0.86102236,
         0.06709265,  0.7715655 ,  1.02875399,  0.1341853 ,  0.80511182,
         0.11182109,  0.05591054,  0.70447284,  0.44728435],
       [ 3.93610224,  1.48242812,  3.11821086,  0.66453674,  3.93610224,
         0.30670927,  3.52715655,  4.7028754 ,  0.61341853,  3.68051118,
         0.51118211,  0.25559105,  3.22044728,  2.04472843],
       [ 0.73801917,  0.27795527,  0.58466454,  0.12460064,  0.73801917,
         0.05750799,  0.66134185,  0.88178914,  0.11501597,  0.69009585,
         0.09584665,  0.04792332,  0.60383387,  0.38338658],
       [ 1.23003195,  0.46325879,  0.97444089,  0.20766773,  1.23003195,
         0.09584665,  1.10223642,  1.46964856,  0.19169329,  1.15015974,
         0.15974441,  0.0798722 ,  1.00638978,  0.63897764],
       [ 0.86102236,  0.32428115,  0.68210863,  0.14536741,  0.86102236,
         0.06709265,  0.7715655 ,  1.02875399,  0.1341853 ,  0.80511182,
         0.11182109,  0.05591054,  0.70447284,  0.44728435],
       [ 0.73801917,  0.27795527,  0.58466454,  0.12460064,  0.73801917,
         0.05750799,  0.66134185,  0.88178914,  0.11501597,  0.69009585,
         0.09584665,  0.04792332,  0.60383387,  0.38338658],
       [ 1.35303514,  0.50958466,  1.07188498,  0.2284345 ,  1.35303514,
         0.10543131,  1.21246006,  1.61661342,  0.21086262,  1.26517572,
         0.17571885,  0.08785942,  1.10702875,  0.7028754 ]]))
In [250]:
p_value = 8.258671120736886e-124
alpha = 0.05
if p_value<alpha:
    print("Reject the null hypothesis. There is a link")
else:
    print("Fail to reject the null hypothesis")
Reject the null hypothesis. There is a link
In [ ]: